import os, json
from pathlib import Path
from tqdm import tqdm


def resolve_image_name(image_id, available_names):
    """Return the image file name for *image_id* present in *available_names*.

    Two candidate names are tried, in this order:
    ``COCO_train2014_<id>.jpg`` then ``COCO_val2014_<id>.jpg``, where ``<id>``
    is *image_id* formatted with the ``012`` format spec.

    Args:
        image_id: Value of the annotation's ``image_id`` field.
        available_names: Container of file names to test membership against.
            Pass a ``set`` so that each lookup is constant-time.

    Returns:
        The first candidate name found in *available_names*.

    Raises:
        ValueError: If neither candidate is in *available_names*.
    """
    for split in ("train2014", "val2014"):
        candidate = f"COCO_{split}_{image_id:012}.jpg"
        if candidate in available_names:
            return candidate
    raise ValueError("image name is wrong")


def convert_annotation(old_ann, available_names):
    """Build one conversation-style record from a caption annotation.

    Args:
        old_ann: Mapping with the keys ``id``, ``image_id`` and ``caption``.
        available_names: Container of image file names (see
            :func:`resolve_image_name`).

    Returns:
        A dict with ``id`` (the annotation id rendered as a string), ``image``
        (the resolved file name) and ``conversations`` (a human turn holding
        ``<image>`` followed by a gpt turn holding the caption).

    Raises:
        ValueError: If no image file matches the annotation; the offending
            annotation is printed first to ease debugging.
    """
    try:
        img_name = resolve_image_name(old_ann['image_id'], available_names)
    except ValueError:
        print(old_ann)
        raise
    return {
        'id': f"{old_ann['id']}",
        'image': img_name,
        'conversations': [
            {'from': 'human', 'value': '<image>'},
            {'from': 'gpt', 'value': old_ann['caption']},
        ],
    }


if __name__ == "__main__":
    # NOTE(review): annotations are located via $HOME/datasets while images are
    # located via $DATASETS -- confirm this asymmetry is intentional.
    old_anns_p = Path(os.getenv("HOME", "")) / "datasets" / "coco_karpathy_split" / "annotations" / "coco_karpathy_train_gt.json"
    imgs_p = Path(os.getenv("DATASETS", "")) / "coco_karpathy_split" / "images" / "train"

    # Explicit checks instead of `assert`, which is removed under `python -O`.
    for required_p in (old_anns_p, imgs_p):
        if not required_p.exists():
            raise FileNotFoundError(f"required path does not exist: {required_p}")

    # A set gives O(1) membership tests; the lookup runs for every annotation.
    imgs_names = {img_p.name for img_p in imgs_p.iterdir()}
    new_anns_p = old_anns_p.parent / "coco_karpathy_train_gt_for_plain_conv_llava.json"

    # Context manager so the input file handle is closed deterministically.
    with open(old_anns_p, "r") as f:
        old_anns = json.load(f)["annotations"]

    new_anns = [convert_annotation(old_ann, imgs_names) for old_ann in tqdm(old_anns)]

    with open(new_anns_p, "w") as f:
        json.dump(new_anns, f)

